# Load any necessary packages here
pkgTest("tidyr")
## Loading required package: tidyr
pkgTest("dplyr")
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
pkgTest("purrr")
## Loading required package: purrr
pkgTest("tidyverse")
## Loading required package: tidyverse
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ readr 1.1.1
## ✔ tibble 1.3.4 ✔ stringr 1.2.0
## ✔ ggplot2 2.2.1 ✔ forcats 0.2.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
pkgTest("broom")
## Loading required package: broom
pkgTest("hashmap")
## Loading required package: hashmap
pkgTest("leaps")
## Loading required package: leaps
pkgTest("MASS")
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(tidyr)
library(dplyr)
library(purrr)
library(tidyverse)
library(broom)
library(hashmap)
library(leaps)
library(MASS)
set.seed(20190908)
# read in birth data with NAs
o_data = read.csv("data/Yr1116Birth.csv", na.strings=c("9","99", "9999"))
# Helper functions
mean_sd_txt = function(x, digits = 0) {
paste0(round(mean(x), digits = digits), "±", round(sd(x), digits = digits))
}
# remove missing rows
birth_data = na.omit(o_data)
# convert to categorical variables
# SEX, CORES(residence), MRACER(race), MHISP(Hispanic origin)
birth_data = birth_data %>% mutate(
YOB = as.factor(YOB),
SEX = as.factor(SEX),
CORES = as.factor(CORES),
MRACER = as.factor(MRACER),
MHISP = as.factor(MHISP),
)
# function to convert numeric values to two-level categorical variables
# based on threshold
transform = function(data, cols, threshold) {
for (i in 1:length(cols)) {
col = cols[i]
data[paste0(col, 'BIN')] = as.factor(
case_when(
data[col] <= threshold ~ 0,
data[col] > threshold ~ 1,
)
)
}
return(data)
}
# convert average #cig to categorical: smoke or not smoke
birth_data = transform(birth_data, c("CIGPN", "CIGFN", "CIGSN", "CIGLN"), 0)
# convert #pregancy to categorical: pregnant before or never
birth_data = transform(birth_data, c("PARITY"), 1)
birth_data$SMOKER = (as.numeric(birth_data$CIGPNBIN) - 1) | # Before pregnancy
(as.numeric(birth_data$CIGPNBIN) - 1) | # First trimester
(as.numeric(birth_data$CIGPNBIN) - 1) | # Second trimester
(as.numeric(birth_data$CIGPNBIN) - 1) # Last trimester
# Combine all low frequency groups to other
birth_data$RACE = as.character(birth_data$MRACER)
birth_data$RACE[birth_data$RACE == "0"] = "O"
birth_data$RACE[birth_data$RACE == "1"] = "W"
birth_data$RACE[birth_data$RACE == "2"] = "B"
birth_data$RACE[birth_data$RACE == 3] = "O" # Native americans -> other
birth_data$RACE[birth_data$RACE == 4] = "A"
birth_data$RACE[birth_data$RACE == 5] = "A"
birth_data$RACE[birth_data$RACE == 6] = "A"
birth_data$RACE[birth_data$RACE == 7] = "A"
birth_data$RACE[birth_data$RACE == 8] = "A"
birth_data$RACE = as.factor(birth_data$RACE)
birth_data$RACE = relevel(birth_data$RACE, ref="W")
# 0 - Other non-White
# 1 - White
# 2 - Black or African American
# 3 - American Indian or Alaska Native
# 4 - Chinese
# 5 - Japanese
# 6 - Native Hawaiian
# 7 - Filipino
# 8 - Other Asian
# Separate hispanic and nonhispanic
birth_data$HISP = as.character(birth_data$MHISP)
birth_data$HISP[birth_data$HISP == "C"] = "Y"
birth_data$HISP[birth_data$HISP == "M"] = "Y"
birth_data$HISP[birth_data$HISP == "N"] = "N"
birth_data$HISP[birth_data$HISP == "O"] = "Y"
birth_data$HISP[birth_data$HISP == "P"] = "Y"
birth_data$HISP[birth_data$HISP == "S"] = "Y"
birth_data$HISP[birth_data$HISP == "U"] = "Y"
birth_data$HISP = birth_data$HISP == "Y"
birth_data = birth_data %>% filter(YOB == 2011)
glimpse(birth_data)
## Observations: 118,315
## Variables: 22
## $ YOB <fctr> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 201...
## $ SEX <fctr> 1, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, ...
## $ CORES <fctr> 78, 70, 11, 54, 25, 60, 13, 79, 10, 4, 49, 12, 1, 2...
## $ CIGPN <int> 0, 0, 5, 0, 10, 0, 0, 0, 0, 0, 4, 0, 20, 20, 0, 0, 0...
## $ CIGFN <int> 0, 0, 5, 0, 10, 0, 0, 0, 0, 0, 4, 0, 10, 0, 0, 0, 0,...
## $ CIGSN <int> 0, 0, 5, 0, 10, 0, 0, 0, 0, 0, 4, 0, 10, 0, 0, 0, 0,...
## $ CIGLN <int> 0, 0, 5, 0, 10, 0, 0, 0, 0, 0, 4, 0, 10, 0, 0, 0, 0,...
## $ BWTG <int> 3062, 2977, 2549, 4309, 2835, 2837, 4032, 3590, 3090...
## $ GEST <int> 37, 39, 38, 40, 38, 38, 39, 39, 41, 39, 38, 41, 39, ...
## $ PLUR <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ MAGE <int> 20, 23, 21, 36, 22, 20, 26, 25, 27, 28, 22, 23, 28, ...
## $ MRACER <fctr> 2, 2, 1, 0, 1, 1, 1, 1, 1, 1, 2, 8, 1, 1, 2, 1, 1, ...
## $ MHISP <fctr> N, N, N, M, N, N, N, N, N, N, N, M, N, N, N, N, N, ...
## $ PARITY <int> 3, 1, 1, 3, 5, 1, 1, 2, 6, 6, 4, 3, 3, 1, 2, 1, 1, 3...
## $ CIGPNBIN <fctr> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, ...
## $ CIGFNBIN <fctr> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...
## $ CIGSNBIN <fctr> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...
## $ CIGLNBIN <fctr> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...
## $ PARITYBIN <fctr> 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...
## $ SMOKER <lgl> FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE...
## $ RACE <fctr> B, B, W, O, W, W, W, W, W, W, B, A, W, W, B, W, W, ...
## $ HISP <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALS...
[analysis: unnatural gestational time(80 weeks) found in the plot -> record error -> remove data point; MORE analysis based on the plots]
ggplot(birth_data, aes(GEST, BWTG)) +
geom_point(aes(col=SMOKER), size=1, shape=2) +
facet_wrap(~SMOKER, labeller = labeller(SMOKER=c("TRUE"="smoker", "FALSE"="non-smoker"))) +
ggtitle("Birth Weight vs. Gestation Time for Non-smokers and Smokers")
ggplot(birth_data, aes(x=BWTG)) +
geom_histogram(aes(y = ..density.., fill=SMOKER), color="grey", alpha=0.7) +
stat_function(fun = dnorm, args = list(mean = mean(birth_data$BWTG), sd = sd(birth_data$BWTG))) +
geom_vline(xintercept = mean(birth_data$BWTG), linetype="dashed") +
facet_wrap(~SMOKER, labeller = labeller(
SMOKER = c(
"TRUE"=paste("Smoker ", mean_sd_txt((birth_data %>% filter(SMOKER == TRUE))$BWTG)),
"FALSE"=paste("Non-smoker", mean_sd_txt((birth_data %>% filter(SMOKER == FALSE))$BWTG))))) +
ggtitle("Birth Weight Distribution for Non-smokers and Smokers")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
A normal pregnancy ranges from 38 to 42 weeks. (Source: https://medlineplus.gov/ency/article/002367.htm)
We note a gestational age nearly double that time at 80. We will remove the entry with that value since we consider it a clerical error.
# remove the data point with more than 80 weeks gestational age
birth_data = birth_data %>% filter(GEST < 50)
Next we can examine the data to determine if there any temporal changes of merit. That is, do we see a difference in any of the covariates as we vary year of birth?
birth_data %>% group_by(YOB) %>% summarize(
GEST = mean_sd_txt(GEST, 1),
BWTG = mean_sd_txt(BWTG, 1),
SMOKER = round(mean(SMOKER == TRUE), 2),
SEX = round(mean(SEX == 1), 2),
MAGE = mean_sd_txt(MAGE, 1),
PLUR = mean_sd_txt(PLUR, 1),
PARITY = mean_sd_txt(PARITY, 1),
MRACER0 = round(mean(MRACER == 0), 3),
MRACER1 = round(mean(MRACER == 1), 3),
MRACER2 = round(mean(MRACER == 2), 3),
MRACER3 = round(mean(MRACER == 3), 3),
MRACER4 = round(mean(MRACER == 4), 3),
)
## # A tibble: 1 x 13
## YOB GEST BWTG SMOKER SEX MAGE PLUR PARITY MRACER0
## <fctr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 2011 38.5±2.3 3259.8±618 0.14 0.51 27.4±6 1±0.2 2.4±1.6 0.124
## # ... with 4 more variables: MRACER1 <dbl>, MRACER2 <dbl>, MRACER3 <dbl>,
## # MRACER4 <dbl>
Note that when inspecting the mean and variance covariates by year of birth, the values remain relatively stable aside from smoker status which decreases slightly by a few percentage points. This suggests that the percentage of the population that smokes is dropping over time across the other covariates. As such, we believe that there is not a significant time-related trend and that year of birth is not a covariate of interest.
This is a full model using binarized smoking and parity covariates courtesy of Naya.
fullmodel_naya = lm(BWTG~SEX + CORES + CIGPNBIN + CIGFNBIN + CIGSNBIN + CIGLNBIN + GEST + PLUR + MAGE + MRACER + MHISP + PARITYBIN, data=birth_data)
summary(fullmodel_naya)
##
## Call:
## lm(formula = BWTG ~ SEX + CORES + CIGPNBIN + CIGFNBIN + CIGSNBIN +
## CIGLNBIN + GEST + PLUR + MAGE + MRACER + MHISP + PARITYBIN,
## data = birth_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3523.9 -278.2 -18.6 256.1 4140.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.202e+03 4.079e+01 -78.482 < 2e-16 ***
## SEX2 -1.205e+02 2.466e+00 -48.862 < 2e-16 ***
## CORES2 -1.202e+01 2.600e+01 -0.462 0.643784
## CORES3 1.433e+01 5.072e+01 0.283 0.777535
## CORES4 -7.010e+01 2.737e+01 -2.561 0.010426 *
## CORES5 -6.295e+01 2.990e+01 -2.105 0.035288 *
## CORES6 -3.443e+01 3.707e+01 -0.929 0.352939
## CORES7 1.635e+01 2.183e+01 0.749 0.453982
## CORES8 -2.574e+00 3.205e+01 -0.080 0.935996
## CORES10 6.268e+00 1.679e+01 0.373 0.708971
## CORES11 -5.419e+01 1.341e+01 -4.041 5.33e-05 ***
## CORES12 -2.290e+01 1.784e+01 -1.284 0.199265
## CORES13 -2.868e+01 1.368e+01 -2.096 0.036093 *
## CORES14 -6.360e+01 1.826e+01 -3.483 0.000497 ***
## CORES15 -2.079e+01 5.861e+01 -0.355 0.722745
## CORES16 6.051e+00 1.987e+01 0.304 0.760773
## CORES17 -6.660e+01 3.254e+01 -2.047 0.040696 *
## CORES18 -2.668e+01 1.450e+01 -1.839 0.065875 .
## CORES19 3.016e-04 1.971e+01 0.000 0.999988
## CORES20 4.173e+01 2.996e+01 1.393 0.163602
## CORES21 3.420e+01 3.423e+01 0.999 0.317799
## CORES22 7.772e+00 5.102e+01 0.152 0.878943
## CORES23 6.804e+00 1.660e+01 0.410 0.681910
## CORES24 -1.751e+01 1.964e+01 -0.892 0.372647
## CORES25 1.532e+01 1.479e+01 1.036 0.300313
## CORES26 2.205e+01 1.179e+01 1.871 0.061322 .
## CORES27 6.890e+01 4.173e+01 1.651 0.098773 .
## CORES28 4.875e+01 2.599e+01 1.876 0.060692 .
## CORES29 1.051e+01 1.458e+01 0.721 0.471124
## CORES30 -2.991e+01 2.419e+01 -1.236 0.216327
## CORES31 -1.046e+01 1.830e+01 -0.572 0.567621
## CORES32 1.570e+00 1.233e+01 0.127 0.898626
## CORES33 -2.198e+01 1.968e+01 -1.117 0.263937
## CORES34 -2.065e+01 1.216e+01 -1.698 0.089578 .
## CORES35 2.622e+01 1.960e+01 1.338 0.180970
## CORES36 -3.942e+01 1.338e+01 -2.945 0.003231 **
## CORES37 -1.960e+01 7.045e+01 -0.278 0.780836
## CORES38 -2.996e+01 4.521e+01 -0.663 0.507475
## CORES39 1.973e+01 2.086e+01 0.946 0.344205
## CORES40 -1.490e+01 2.977e+01 -0.501 0.616627
## CORES41 -2.276e+00 1.179e+01 -0.193 0.846915
## CORES42 1.418e+01 2.069e+01 0.685 0.493288
## CORES43 -9.035e+00 1.454e+01 -0.621 0.534288
## CORES44 -2.786e+01 2.063e+01 -1.350 0.176943
## CORES45 -2.425e+01 1.669e+01 -1.453 0.146339
## CORES46 2.816e+00 3.075e+01 0.092 0.927036
## CORES47 1.644e+01 1.703e+01 0.965 0.334448
## CORES48 -1.145e+01 6.205e+01 -0.185 0.853557
## CORES49 -9.839e+00 1.455e+01 -0.676 0.498976
## CORES50 2.496e+01 2.396e+01 1.042 0.297473
## CORES51 2.468e+01 1.371e+01 1.801 0.071765 .
## CORES52 -2.212e+01 4.038e+01 -0.548 0.583842
## CORES53 -3.349e+00 1.811e+01 -0.185 0.853263
## CORES54 2.157e+01 1.960e+01 1.100 0.271124
## CORES55 -7.994e+00 1.831e+01 -0.437 0.662415
## CORES56 2.238e+01 2.261e+01 0.990 0.322175
## CORES57 -1.408e+01 2.628e+01 -0.536 0.592065
## CORES58 -7.311e+01 3.311e+01 -2.208 0.027261 *
## CORES59 5.374e+01 2.950e+01 1.822 0.068451 .
## CORES60 -3.449e+01 1.106e+01 -3.119 0.001814 **
## CORES61 -9.989e+01 3.782e+01 -2.641 0.008270 **
## CORES62 9.430e+00 2.494e+01 0.378 0.705291
## CORES63 2.459e+01 1.722e+01 1.428 0.153389
## CORES64 4.725e+00 1.659e+01 0.285 0.775755
## CORES65 -3.532e+01 1.375e+01 -2.569 0.010206 *
## CORES66 -2.813e+01 3.473e+01 -0.810 0.417923
## CORES67 5.112e+00 1.226e+01 0.417 0.676802
## CORES68 -2.722e+01 1.574e+01 -1.729 0.083801 .
## CORES69 -8.524e+01 4.612e+01 -1.848 0.064539 .
## CORES70 -2.123e+01 2.310e+01 -0.919 0.357998
## CORES71 -2.151e+01 2.006e+01 -1.072 0.283740
## CORES72 2.241e+01 4.038e+01 0.555 0.579038
## CORES73 2.776e+01 2.374e+01 1.169 0.242223
## CORES74 -2.276e+00 1.390e+01 -0.164 0.869968
## CORES75 7.638e+01 3.946e+01 1.935 0.052936 .
## CORES76 -2.676e+01 1.500e+01 -1.784 0.074412 .
## CORES77 1.423e+01 2.024e+01 0.703 0.482192
## CORES78 -7.811e+01 1.513e+01 -5.164 2.42e-07 ***
## CORES79 -2.672e+01 1.776e+01 -1.504 0.132477
## CORES80 -1.157e+00 1.503e+01 -0.077 0.938623
## CORES81 -1.981e+01 1.915e+01 -1.035 0.300743
## CORES82 2.837e+01 1.804e+01 1.573 0.115789
## CORES83 -3.675e+01 2.258e+01 -1.627 0.103730
## CORES84 1.418e+00 1.976e+01 0.072 0.942792
## CORES85 -3.277e+00 2.375e+01 -0.138 0.890288
## CORES86 -1.417e+00 1.852e+01 -0.077 0.939020
## CORES87 1.018e+02 3.242e+01 3.141 0.001684 **
## CORES88 -7.018e+01 2.824e+01 -2.485 0.012950 *
## CORES89 -7.163e-01 6.474e+01 -0.011 0.991172
## CORES90 -2.393e+01 1.358e+01 -1.762 0.078103 .
## CORES91 -1.830e+01 2.072e+01 -0.883 0.377139
## CORES92 9.330e+00 1.112e+01 0.839 0.401431
## CORES93 1.634e+01 3.471e+01 0.471 0.637909
## CORES94 5.971e+01 3.744e+01 1.595 0.110745
## CORES95 -1.964e+01 2.469e+01 -0.795 0.426488
## CORES96 2.004e+01 1.459e+01 1.373 0.169783
## CORES97 1.230e+01 1.945e+01 0.632 0.527079
## CORES98 3.248e+00 1.700e+01 0.191 0.848452
## CORES100 3.839e+01 3.425e+01 1.121 0.262330
## CIGPNBIN1 -5.734e+00 6.437e+00 -0.891 0.373072
## CIGFNBIN1 -5.416e+01 1.053e+01 -5.143 2.71e-07 ***
## CIGSNBIN1 -6.791e+01 1.505e+01 -4.512 6.41e-06 ***
## CIGLNBIN1 -1.067e+02 1.326e+01 -8.046 8.63e-16 ***
## GEST 1.749e+02 5.686e-01 307.494 < 2e-16 ***
## PLUR -3.480e+02 6.597e+00 -52.757 < 2e-16 ***
## MAGE 5.619e+00 2.309e-01 24.332 < 2e-16 ***
## MRACER1 6.209e+01 7.937e+00 7.823 5.20e-15 ***
## MRACER2 -1.307e+02 8.331e+00 -15.694 < 2e-16 ***
## MRACER3 -1.945e-01 1.400e+01 -0.014 0.988917
## MRACER4 -5.513e+01 2.022e+01 -2.727 0.006396 **
## MRACER5 -1.311e+02 4.067e+01 -3.222 0.001271 **
## MRACER6 -3.427e+01 1.135e+02 -0.302 0.762798
## MRACER7 -6.792e+01 2.426e+01 -2.800 0.005114 **
## MRACER8 -1.183e+02 1.076e+01 -10.998 < 2e-16 ***
## MHISPM -6.082e+01 3.010e+01 -2.021 0.043314 *
## MHISPN -4.061e+01 2.981e+01 -1.362 0.173050
## MHISPO -6.831e+01 3.274e+01 -2.087 0.036909 *
## MHISPP -7.275e+01 3.204e+01 -2.271 0.023165 *
## MHISPS -7.790e+01 3.056e+01 -2.549 0.010802 *
## MHISPU -7.241e+01 6.745e+01 -1.074 0.283031
## PARITYBIN1 1.033e+02 2.792e+00 36.989 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 423.8 on 118193 degrees of freedom
## Multiple R-squared: 0.5302, Adjusted R-squared: 0.5297
## F-statistic: 1112 on 120 and 118193 DF, p-value: < 2.2e-16
fullmodel_naya = broom::tidy(fullmodel_naya)
Since full model has too many variables, we drop the variables with p-values larger than a threshold value as the first step. We set the threshold value to 0.01 here.
# select variables with p-value below a threshold value
pthreshold = 0.01
used_index = which(fullmodel_naya$p.value[-1] < pthreshold)
used_vars = fullmodel_naya$term[used_index]
get.used.data = function(used_vars) {
levels_map = list()
for (var in used_vars) {
level = str_extract(var, "[0-9]+")
if (!is.na(level)) {
original = sub("[0-9]+", "", var)
if (is.null(levels_map[[original]])) {
levels_map[[original]] = c()
}
levels_map[[original]] = c(levels_map[[original]], level)
}
if (!is.na(str_extract(var, "MHISP"))) {
if (is.null(levels_map[["MHISP"]])) {
levels_map[["MHISP"]] = c()
}
levels_map[["MHISP"]] = sub("MHISP([M|N|O|P|S|U])", "\\1", var)
}
}
original_vars = unique(sub("MHISP.*", "MHISP", sub("[0-9]+", "", used_vars)))
used_data = birth_data[, which(names(birth_data) %in% c(original_vars, "BWTG"))]
for (colname in colnames(used_data)) {
# select levels with p-value larger than the threshold
if (is.factor(used_data[[colname]])) {
used_data[[colname]] = factor(used_data[[colname]], levels = c(levels(birth_data[[colname]])[1], levels_map[[colname]]))
used_data[[colname]][is.na(used_data[[colname]])] = levels(birth_data[[colname]])[1]
}
}
return(used_data)
}
used_data = get.used.data(used_vars)
model2_naya = lm(BWTG ~ ., data = used_data)
summary(model2_naya)
##
## Call:
## lm(formula = BWTG ~ ., data = used_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3406.6 -284.7 -20.4 261.9 4198.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3399.3561 25.9001 -131.249 < 2e-16 ***
## CORES10 13.2596 13.5040 0.982 0.3261
## CORES13 -16.0433 9.1583 -1.752 0.0798 .
## CORES35 34.3973 16.9692 2.027 0.0427 *
## CORES59 49.8154 28.1664 1.769 0.0770 .
## CORES60 -39.4350 4.0017 -9.854 < 2e-16 ***
## CORES77 19.1318 17.7515 1.078 0.2811
## CORES86 13.6330 15.6588 0.871 0.3840
## GEST 174.5119 0.5774 302.245 < 2e-16 ***
## PLUR -345.2442 6.7165 -51.402 < 2e-16 ***
## MAGE 8.6946 0.2153 40.389 < 2e-16 ***
## MRACER1 142.4741 2.6650 53.462 < 2e-16 ***
## MRACER3 66.9568 10.7803 6.211 5.28e-10 ***
## MRACER4 12.5052 19.0069 0.658 0.5106
## MRACER6 53.2825 115.5262 0.461 0.6446
## MRACER7 13.3399 23.4825 0.568 0.5700
## MHISPU -31.6444 61.7634 -0.512 0.6084
## CIGPNBIN1 -5.6757 6.5104 -0.872 0.3833
## CIGFNBIN1 -53.0935 10.7280 -4.949 7.47e-07 ***
## CIGSNBIN1 -64.0416 15.3356 -4.176 2.97e-05 ***
## CIGLNBIN1 -102.2734 13.5050 -7.573 3.67e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 432.1 on 118293 degrees of freedom
## Multiple R-squared: 0.5111, Adjusted R-squared: 0.511
## F-statistic: 6183 on 20 and 118293 DF, p-value: < 2.2e-16
step(model2_naya, direction = "backward")
## Start: AIC=1436049
## BWTG ~ CORES + GEST + PLUR + MAGE + MRACER + MHISP + CIGPNBIN +
## CIGFNBIN + CIGSNBIN + CIGLNBIN
##
## Df Sum of Sq RSS AIC
## - MHISP 1 4.9017e+04 2.2089e+10 1436047
## - CIGPNBIN 1 1.4192e+05 2.2089e+10 1436047
## <none> 2.2089e+10 1436049
## - CIGSNBIN 1 3.2564e+06 2.2092e+10 1436064
## - CIGFNBIN 1 4.5736e+06 2.2094e+10 1436071
## - CIGLNBIN 1 1.0709e+07 2.2100e+10 1436104
## - CORES 7 2.0941e+07 2.2110e+10 1436147
## - MAGE 1 3.0460e+08 2.2394e+10 1437667
## - PLUR 1 4.9338e+08 2.2582e+10 1438660
## - MRACER 5 5.3934e+08 2.2628e+10 1438893
## - GEST 1 1.7058e+10 3.9147e+10 1503752
##
## Step: AIC=1436047
## BWTG ~ CORES + GEST + PLUR + MAGE + MRACER + CIGPNBIN + CIGFNBIN +
## CIGSNBIN + CIGLNBIN
##
## Df Sum of Sq RSS AIC
## - CIGPNBIN 1 1.4200e+05 2.2089e+10 1436046
## <none> 2.2089e+10 1436047
## - CIGSNBIN 1 3.2517e+06 2.2092e+10 1436062
## - CIGFNBIN 1 4.5760e+06 2.2094e+10 1436069
## - CIGLNBIN 1 1.0715e+07 2.2100e+10 1436102
## - CORES 7 2.0941e+07 2.2110e+10 1436145
## - MAGE 1 3.0461e+08 2.2394e+10 1437665
## - PLUR 1 4.9336e+08 2.2582e+10 1438658
## - MRACER 5 5.3936e+08 2.2628e+10 1438891
## - GEST 1 1.7058e+10 3.9147e+10 1503750
##
## Step: AIC=1436046
## BWTG ~ CORES + GEST + PLUR + MAGE + MRACER + CIGFNBIN + CIGSNBIN +
## CIGLNBIN
##
## Df Sum of Sq RSS AIC
## <none> 2.2089e+10 1436046
## - CIGSNBIN 1 3.2655e+06 2.2092e+10 1436061
## - CIGFNBIN 1 7.2300e+06 2.2096e+10 1436082
## - CIGLNBIN 1 1.0887e+07 2.2100e+10 1436102
## - CORES 7 2.0898e+07 2.2110e+10 1436143
## - MAGE 1 3.0753e+08 2.2397e+10 1437679
## - PLUR 1 4.9343e+08 2.2583e+10 1438657
## - MRACER 5 5.4018e+08 2.2629e+10 1438894
## - GEST 1 1.7058e+10 3.9147e+10 1503748
##
## Call:
## lm(formula = BWTG ~ CORES + GEST + PLUR + MAGE + MRACER + CIGFNBIN +
## CIGSNBIN + CIGLNBIN, data = used_data)
##
## Coefficients:
## (Intercept) CORES10 CORES13 CORES35 CORES59
## -3399.858 13.298 -16.079 34.409 49.629
## CORES60 CORES77 CORES86 GEST PLUR
## -39.388 18.970 13.713 174.510 -345.258
## MAGE MRACER1 MRACER3 MRACER4 MRACER6
## 8.709 142.340 66.780 12.524 53.053
## MRACER7 CIGFNBIN1 CIGSNBIN1 CIGLNBIN1
## 13.033 -57.790 -64.127 -102.957
The function regsubsets in leaps package select the subset of variables of fixed size by comparing \(R^2\) for all combinations of variables.
regfit_backward <- leaps::regsubsets(BWTG ~ ., data = used_data,
method="backward")
summary(regfit_backward)
## Subset selection object
## Call: regsubsets.formula(BWTG ~ ., data = used_data, method = "backward")
## 20 Variables (and intercept)
## Forced in Forced out
## CORES10 FALSE FALSE
## CORES13 FALSE FALSE
## CORES35 FALSE FALSE
## CORES59 FALSE FALSE
## CORES60 FALSE FALSE
## CORES77 FALSE FALSE
## CORES86 FALSE FALSE
## GEST FALSE FALSE
## PLUR FALSE FALSE
## MAGE FALSE FALSE
## MRACER1 FALSE FALSE
## MRACER3 FALSE FALSE
## MRACER4 FALSE FALSE
## MRACER6 FALSE FALSE
## MRACER7 FALSE FALSE
## MHISPU FALSE FALSE
## CIGPNBIN1 FALSE FALSE
## CIGFNBIN1 FALSE FALSE
## CIGSNBIN1 FALSE FALSE
## CIGLNBIN1 FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: backward
## CORES10 CORES13 CORES35 CORES59 CORES60 CORES77 CORES86 GEST PLUR
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " "
## 3 ( 1 ) " " " " " " " " " " " " " " "*" " "
## 4 ( 1 ) " " " " " " " " " " " " " " "*" "*"
## 5 ( 1 ) " " " " " " " " " " " " " " "*" "*"
## 6 ( 1 ) " " " " " " " " "*" " " " " "*" "*"
## 7 ( 1 ) " " " " " " " " "*" " " " " "*" "*"
## 8 ( 1 ) " " " " " " " " "*" " " " " "*" "*"
## MAGE MRACER1 MRACER3 MRACER4 MRACER6 MRACER7 MHISPU CIGPNBIN1
## 1 ( 1 ) " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" "*" "*" " " " " " " " " " "
## CIGFNBIN1 CIGSNBIN1 CIGLNBIN1
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " " " "*"
## 4 ( 1 ) " " " " "*"
## 5 ( 1 ) " " " " "*"
## 6 ( 1 ) " " " " "*"
## 7 ( 1 ) "*" " " "*"
## 8 ( 1 ) "*" " " "*"
The function regsubsets chooses GEST, MRACER1, CIGLN1, PLUR, MAGE, CIGFN1, MRACER3, CORES26 as the eight most significant variables(from the most significant to least significant). Based on the exploratory work done by regsubsets, we will use these variables as predictors for now.
used_vars = c("GEST", "MRACER1", "CIGLNBIN1", "PLUR", "MAGE", "CIGFNBIN1", "MRACER3", "CORES26")
used_data = get.used.data(used_vars)
finalmodel <- lm(BWTG ~ GEST+MRACER+CIGLNBIN+PLUR+MAGE+CIGFNBIN+CORES, data = used_data)
summary(finalmodel)
##
## Call:
## lm(formula = BWTG ~ GEST + MRACER + CIGLNBIN + PLUR + MAGE +
## CIGFNBIN + CORES, data = used_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3401.0 -284.9 -20.5 262.4 4203.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3400.9044 25.8955 -131.332 < 2e-16 ***
## GEST 174.4692 0.5772 302.251 < 2e-16 ***
## MRACER1 145.4427 2.6236 55.435 < 2e-16 ***
## MRACER3 70.7545 10.7673 6.571 5.01e-11 ***
## CIGLNBIN1 -143.6183 8.7237 -16.463 < 2e-16 ***
## PLUR -345.5425 6.7185 -51.432 < 2e-16 ***
## MAGE 8.5391 0.2131 40.066 < 2e-16 ***
## CIGFNBIN1 -77.1101 8.0080 -9.629 < 2e-16 ***
## CORES26 26.1174 5.7367 4.553 5.30e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 432.3 on 118305 degrees of freedom
## Multiple R-squared: 0.5106, Adjusted R-squared: 0.5106
## F-statistic: 1.543e+04 on 8 and 118305 DF, p-value: < 2.2e-16
[analyze the plot and state the fact that data are contaminated with outliers or influential observations]
# Diagonostic plots examining residuals, fitted values, Cook’s distance, and leverage
par(mfrow=c(2,2))
plot(finalmodel)
We observed points of high leverage and influence from the plots. We have decided that these data points are not data entry errors, neither they are from a different population than most of our data. So we have no compelling reason to exclude them from the analysis. Robust regression might be a good strategy since it is a compromise between excluding these points entirely from the analysis and including all the data points and treating all them equally in OLS regression. The idea of robust regression is to weigh the observations differently based on how well behaved these observations are. Roughly speaking, it is a form of weighted and reweighted least squares regression.
robustmodel <- rlm(BWTG ~ GEST+MRACER+CIGLNBIN+PLUR+MAGE+CIGFNBIN+CORES, data = used_data)
summary(robustmodel)
##
## Call: rlm(formula = BWTG ~ GEST + MRACER + CIGLNBIN + PLUR + MAGE +
## CIGFNBIN + CORES, data = used_data)
## Residuals:
## Min 1Q Median 3Q Max
## -3393.733 -273.729 -9.521 272.658 4285.721
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -3554.6777 25.3378 -140.2913
## GEST 177.7944 0.5648 314.7909
## MRACER1 145.7638 2.5671 56.7806
## MRACER3 63.8613 10.5354 6.0616
## CIGLNBIN1 -143.8171 8.5358 -16.8487
## PLUR -327.9673 6.5738 -49.8901
## MAGE 8.4012 0.2085 40.2869
## CIGFNBIN1 -73.7702 7.8356 -9.4148
## CORES26 28.4005 5.6131 5.0596
##
## Residual standard error: 405.1 on 118305 degrees of freedom
[It seems the result from robust regression does not differ too much from the result from OLS regression. What does that mean?]
Faraz: First of all, Naya has done an incredible amount of work so far. She’s really gone the extra mile here. I just want to point out that I don’t think it’s a good idea to drop covariates using regsubsets or p-values alone.
Consider that the method above eliminates categorical covariates indiscriminately. Unless I’m misunderstanding, entire race categories are dropped during the selection process. Also, using p-values specifically as a determinant feels like p-hacking.
I took a step back and added a few points of analysis that are useful.
A few decisions in this regard:
inspect_covariate = function(covariate) {
df = birth_data %>% group_by(!!covariate) %>% summarise(weight = mean(BWTG), se = sd(BWTG))
ggplot(df, aes_string(x=quo_name(covariate), y='weight')) +
geom_errorbar(aes(ymin=weight-se, ymax=weight+se), width=.1) +
geom_line() +
geom_point()
}
inspect_covariate(quo(PARITY))
## Warning: Removed 2 rows containing missing values (geom_errorbar).
inspect_covariate(quo(MAGE))
## Warning: Removed 1 rows containing missing values (geom_errorbar).
inspect_covariate(quo(RACE))
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
inspect_covariate(quo(HISP))
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
inspect_covariate(quo(MAGE))
## Warning: Removed 1 rows containing missing values (geom_errorbar).
inspect_covariate(quo(PLUR))
inspect_covariate(quo(SMOKER))
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
inspect_covariate(quo(CIGPN))
## Warning: Removed 9 rows containing missing values (geom_errorbar).
inspect_covariate(quo(CIGFN))
## Warning: Removed 4 rows containing missing values (geom_errorbar).
inspect_covariate(quo(CIGSN))
## Warning: Removed 4 rows containing missing values (geom_errorbar).
inspect_covariate(quo(CIGLN))
## Warning: Removed 11 rows containing missing values (geom_errorbar).
model = lm(BWTG ~ SEX + SMOKER + GEST + PLUR + MAGE^2 + RACE, data = birth_data)
plot(model, las=1)
summary(model)
##
## Call:
## lm(formula = BWTG ~ SEX + SMOKER + GEST + PLUR + MAGE^2 + RACE,
## data = birth_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3457.8 -281.1 -18.9 259.2 4186.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3198.0873 25.9647 -123.17 <2e-16 ***
## SEX2 -121.2423 2.4918 -48.66 <2e-16 ***
## SMOKERTRUE -134.4911 3.6442 -36.91 <2e-16 ***
## GEST 174.5041 0.5728 304.66 <2e-16 ***
## PLUR -338.8981 6.6574 -50.91 <2e-16 ***
## MAGE 8.2681 0.2133 38.76 <2e-16 ***
## RACEA -171.9756 6.7694 -25.41 <2e-16 ***
## RACEB -177.4937 3.0765 -57.69 <2e-16 ***
## RACEO -67.1727 3.7687 -17.82 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 428.4 on 118305 degrees of freedom
## Multiple R-squared: 0.5194, Adjusted R-squared: 0.5194
## F-statistic: 1.598e+04 on 8 and 118305 DF, p-value: < 2.2e-16
model = rlm(BWTG ~ SEX + SMOKER + GEST + PLUR + MAGE^2 + RACE, data = birth_data)
plot(model, las=1)
summary(model)
##
## Call: rlm(formula = BWTG ~ SEX + SMOKER + GEST + PLUR + MAGE^2 + RACE,
## data = birth_data)
## Residuals:
## Min 1Q Median 3Q Max
## -3451.186 -270.670 -8.232 270.438 4272.446
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -3354.2639 25.3548 -132.2933
## SEX2 -121.9549 2.4333 -50.1199
## SMOKERTRUE -133.0335 3.5586 -37.3837
## GEST 177.9024 0.5593 318.0619
## PLUR -321.0704 6.5010 -49.3878
## MAGE 8.1383 0.2083 39.0657
## RACEA -169.8854 6.6103 -25.7000
## RACEB -176.6605 3.0042 -58.8045
## RACEO -69.7102 3.6801 -18.9422
##
## Residual standard error: 401.3 on 118305 degrees of freedom